************************************* 
*************** SETUP *************** 
************************************* 

// Record Linkage for Character-Based Names
// 1900-1910 matches
// Author: Hannah Postel
// Date: 10/26/2022


**************************************
************* STEP ZERO **************
********** BASIC 1910 CLEAN **********
*********** BASELINE MATCH ***********
************************************** 

// Load the raw 1910 extract. Everything below edits namefrst/namelast in
// place and writes the result to 1910_chn_clean.dta.
use "$data/1910_chinese.dta", clear

*** SPECIFY MISSING NAMES ***
// "." is the placeholder for a missing name during the base clean.

// N=16
replace namefrst="." if namefrst=="" | namefrst=="UNKNOWN" | namefrst=="<BLANK>"
// N=16
replace namelast="." if namelast=="" | namelast=="UNKNOWN" | namelast=="<BLANK>"

*** REPLACE SPECIALS WITH SPACES/REMOVE ***
// Hyphens become spaces; the other punctuation marks are deleted.
// Periods are stripped last, and never from the "." missing placeholder.
replace namefrst = subinstr(namefrst, "-", " ",.)
replace namefrst = subinstr(namefrst, "(", "",.)
replace namefrst = subinstr(namefrst, ")", "",.)
replace namefrst = subinstr(namefrst, "?", "",.)
replace namefrst = subinstr(namefrst, "*", "",.)
replace namefrst = subinstr(namefrst, ",", "",.)
replace namefrst = subinstr(namefrst, "'", "",.)
replace namefrst = subinstr(namefrst, ".", "",.) if namefrst!="."

// NOTE(review): an apostrophe is deleted in namefrst (above) but replaced by
// a space in namelast (below) -- confirm this asymmetry is intended.
replace namelast = subinstr(namelast, "-", " ",.)
replace namelast = subinstr(namelast, "*", "",.)
replace namelast = subinstr(namelast, "(", "",.)
replace namelast = subinstr(namelast, ")", "",.)
replace namelast = subinstr(namelast, "?", "",.)
replace namelast = subinstr(namelast, ",", "",.)
replace namelast = subinstr(namelast, "'", " ",.)
replace namelast = subinstr(namelast, ">", "",.)
replace namelast = subinstr(namelast, ".", "",.) if namelast!="."

*** TRIM ADDITIONAL SPACES ***
// Internal (collapse runs of blanks created by the replacements above)
replace namefrst = stritrim(namefrst)
replace namelast = stritrim(namelast)
// External (leading and trailing)
replace namefrst = strtrim(namefrst)
replace namelast = strtrim(namelast)

*** FORMATTING "AH" HONORIFIC ***
// subinstr() matches anywhere in the string, so "A H" and "A " are also
// rewritten when the "A" ends a longer name part (e.g. "MA LEE" -> "MAH LEE").
// NOTE(review): confirm that is intended rather than whole-word matching.
replace namefrst = subinstr(namefrst, "A H", "AH",.)
replace namefrst = subinstr(namefrst, "A ", "AH ",.)
replace namefrst = "AH" if namefrst=="A"
replace namefrst = "AH" if namefrst=="AA"
// Undo the "AA " -> "AAH " side effect of the "A " rule above
replace namefrst = subinstr(namefrst, "AAH ", "AH ",.)

*** DROP NON-REAL NAMES ***
// Most with "China" as a name fragment have either 1 or 0 additional fragments
// Clean those that do have full names
replace namefrst="CHARLEY" if namefrst=="CHINAH CHARLEY"
// Use the "." missing placeholder (not an empty string) so this record is
// coded like every other missing surname in the baseline match.
// Presumably its namelast holds a CHINA fragment that would otherwise
// trigger the drop below -- verify against the raw data.
replace namelast="." if namefrst=="CHINN DING CHEE"

drop if strpos(namelast, "CHINA") > 0  // 37 obs
drop if strpos(namefrst, "CHINA") > 0 // 14 obs

*** STANDARDIZE AMERICANIZED SPELLINGS ***
replace namefrst="CHARLIE" if namefrst=="CHARLEY" | namefrst=="CHARLY" | namefrst=="CHARLI"
// Periods were already stripped above, so only the bare "CHAS" can occur here
replace namefrst="CHARLES" if namefrst=="CHAS"
// NOTE(review): namefrst checks "CHARLI" but namelast checks "CHARLIS" -- confirm
replace namelast="CHARLIE" if namelast=="CHARLEY" | namelast=="CHARLY" | namelast=="CHARLIS"

save "$data/1910_chn_clean.dta", replace


*** MATCHES WITH BASIC CLEANING ***

// Configure the ABE run: 1900 file (A) vs. 1910 file (B), linked on the
// census names as written after basic cleaning.
	global timediff = 1910 - 1900
	global match_vars namefrst namelast
	global B "$data/1910_chn_clean.dta"
	global A "$data/1900_chn_clean.dta"

// Standard ABE match. N=4298.
// abematch is run from an empty default frame with $data as working directory.
	cd "$data"
	frame change default
	clear
	abematch $match_vars, file_A($A) file_B($B) timevar(age) timediff($timediff) ///
		unique_m(2) unique_f(2) keep_A(histid_1900) keep_B(histid_1910)

// Tidy the result: drop the age/timediff helpers, strip the _A/_B suffixes,
// tag the rows as step-zero matches, then prefix every variable with "_".
	drop timediff* age
	ren *_A *
	ren *_B *
	gen step = "0"
	ren * _*

// Start the cumulative 'matches' frame. Segmented name parts do not exist
// yet at this step, so they are filled with the "." placeholder.
	frame copy default matches, replace
	frame matches {
		forvalues k = 1/4 {
			gen _name`k' = "."
		}
	}
	
***************************************
*************** STEP ONE **************
********** POST-SEGMENTATION **********
***************************************

*** 1910 NAME SEGMENTATION ***

// Split the cleaned 1910 first/last name strings into up to four ordered
// name parts (name1-name4) and save them as 1910_chn_finalnames.dta.
use "$data/1910_chn_clean.dta", clear

// Undo the "." missing placeholder so that blank names are empty strings
// while the parts are being assembled
replace namefrst="" if namefrst=="."
replace namelast="" if namelast=="."

// nspace_* = number of blanks in the string;
// nchar_*  = number of space-separated name parts (0 when the name is blank).
// These counts are taken BEFORE the compound-surname merge below, so a
// merged surname such as SOOHOO still counts as two parts.
gen nspace_frst = length(namefrst) - length(subinstr(namefrst," ", "", .))
gen nchar_frst=(nspace_frst+1) if namefrst!=""
replace nchar_frst=0 if nchar_frst==.

gen nspace_last = length(namelast) - length(subinstr(namelast," ","", .))
gen nchar_last=(nspace_last+1) if namelast!=""
replace nchar_last=0 if nchar_last==.

gen nspace_tot = nspace_frst + nspace_last
gen nchar_tot = nchar_frst + nchar_last

// "first" names
// The code below refers to name1-name4, so it assumes split creates four
// variables (i.e. some namefrst has four parts) -- TODO confirm
split namefrst, gen(name)

// fix two character surnames: merge the two parts into one token in name1,
// then shift the following parts one slot to the left
replace name1="SOOHOO" if ((name1=="SOO" & name2=="HOO") | (name1=="SEE" & name2=="HOO") | (name1=="SOO" & name2=="HO") | (name1=="SEE" & name2=="TOO"))
replace name1= "OWYANG" if ((name1=="OW" & name2=="YOUNG") | (name1=="OW" & name2=="YANG") | (name1=="OU" & name2=="YONG") | (name1=="OW" & name2=="YUNG"))
replace name2=name3 if (name1=="SOOHOO" | name1=="OWYANG")
// NOTE(review): name4 keeps its old value after this shift (the parallel
// othername shift further down blanks its last slot) -- confirm intended
replace name3=name4 if (name1=="SOOHOO" | name1=="OWYANG")

// fill in blanks with missing dot (working to "fill" all blank spots)
// With no last name, nothing can arrive later to fill these slots
replace name2="." if namelast=="" & name2==""
replace name3="." if namelast=="" & name3==""
replace name4="." if namelast=="" & name4==""

// "last" names
// The order command below needs othername1-othername4 to exist -- TODO confirm
split namelast, gen(othername)
order namefrst name1-name4 namelast othername1-othername4

// fix two character surnames (same merge as above, for the last-name parts)
replace othername1="SOOHOO" if ((othername1=="SEE" & othername2=="HE") | (othername1=="SEE" & othername2=="HO") | (othername1=="SO" & othername2=="HO") | (othername1=="SO" & othername2=="HOO") | (othername1=="SOO" & othername2=="HO") | (othername1=="SOO" & othername2=="HOO"))
replace othername1= "OWYANG" if ((othername1=="OW" & othername2=="YAN") | (othername1=="OW" & othername2=="YANG"))

replace othername2=othername3 if (othername1=="SOOHOO" | othername1=="OWYANG")
replace othername3="" if (othername1=="SOOHOO" | othername1=="OWYANG")

// filling in blanks: with no first name, the first last-name part leads
replace name1=othername1 if name1=="" & othername1!=""

// easy fill - one character name in both locations
replace name2=othername1 if nspace_tot==0 & nchar_tot==2

// blank first names: remaining last-name parts follow in order
replace name2=othername2 if name2=="" & namefrst=="" & othername2!=""
replace name3=othername3 if name3=="" & namefrst=="" & othername3!=""

// fill in missing values: slots beyond the total part count get "."
replace name3="." if nchar_tot<3
replace name4="." if nchar_tot<4

// moving on to 3 names
// two syllable first, one syllable last
replace name3=othername1 if name3=="" & nchar_frst==2 & nchar_tot==3
// one syllable first, two syllable last (first char)
replace name2=othername1 if name2=="" & nchar_frst==1 & nchar_tot==3
// one syllable first, two syllable last (second char)
replace name3=othername2 if name3=="" & nchar_frst==1 & nchar_tot==3
// a merged compound surname in name2 leaves no part for name3
replace name3="." if name3=="" & (name2=="SOOHOO" | name2=="OWYANG")

// 4 names
// two and two (first)
replace name3=othername1 if name3=="" & nchar_frst==2 & nchar_tot==4
// two and two (second)
replace name4=othername2 if name4=="" & nchar_frst==2 & nchar_tot==4
// three plus one
replace name4=othername1 if name4=="" & nchar_frst==3 & nchar_tot==4
// one plus three
replace name2=othername1 if name2=="" & nchar_frst==1 & nchar_tot==4
replace name3=othername2 if name3=="" & nchar_frst==1 & nchar_tot==4
replace name4=othername3 if name4=="" & nchar_frst==1 & nchar_tot==4
// a merged compound surname in name3 is the final part
replace name4="." if (name3=="SOOHOO" | name3=="OWYANG")
// OWYANG present in both name1 and name3: move the name3 copy to name2
replace name2="OWYANG" if name1=="OWYANG" & name3=="OWYANG"
// (name3 was just copied to name2; the condition below still holds, so the
// duplicate in name3 is blanked)
replace name3="" if name1=="OWYANG" & name3=="OWYANG"

// take out periods
// This also turns a "." placeholder in name1/name2 into an empty string
replace name1 = subinstr(name1, ".", "",.)
replace name2 = subinstr(name2, ".", "",.)  

replace name1="AH" if name1=="A"
// NOTE(review): name2 can no longer equal "." here because periods were
// stripped just above, so the next replace never fires -- confirm whether
// name2=="" was meant
replace name2=name4 if name2=="." & name3=="."
replace name4="." if name2==name4 & name3=="."

// restore the "." placeholder for empty name1/name2
replace name2="." if name2==""
replace name1="." if name1==""

order name1-name4 namefrst namelast othername1-othername3
sort name1 name2 name3 name4 
save "$data/1910_chn_finalnames.dta", replace

	// Step-one ABE run: link on the segmented name parts
	global timediff = 1910 - 1900
	global match_vars name1 name2 name3 name4
	global B "$data/1910_chn_finalnames.dta"
	global A "$data/1900_chn_finalnames.dta"

	cd "$data"
	frame change default
	clear
	abematch $match_vars, file_A($A) file_B($B) timevar(age) timediff($timediff) ///
		unique_m(2) unique_f(2) keep_A(histid_1900) keep_B(histid_1910)

	// Same tidy-up as the baseline run. The raw name strings are not part of
	// this match, so they are filled with "." to line up with the step-zero rows.
	drop timediff* age
	ren *_A *
	ren *_B *
	gen namefrst="."
	gen namelast="."
	gen step = "1" // identify this came after segmentation. N=298
	ren * _*

// Stack the step-zero matches underneath and resolve overlaps between steps.
// Step zero wins (it matched before any segmentation): whenever a histid
// appears twice, the step-one row is the one removed.
// First pass is by 1900 id (n=3759 dropped), second by 1910 id (n=130 dropped).
	frameappend matches
	foreach yr in 1900 1910 {
		bys _histid_`yr': gen dupes = _N
		drop if _step=="1" & dupes==2
		drop dupes
	}
	frame copy default matches, replace

// Build the working frames f1900 / f1910 used by the later steps.
// Each holds only the records NOT yet present in the 'matches' frame,
// reduced to the name parts, the histid and age (all "_"-prefixed).
foreach yr in 1900 1910 {
	frame change default
	use "$data/`yr'_chn_finalnames.dta", clear
	ren * _*
	// frlink stores the linked row of 'matches' in a variable named matches;
	// it is missing when the histid has no match yet
	frlink 1:1 _histid_`yr', frame(matches)
	keep if matches==.
	frame copy default f`yr', replace
	frame f`yr': keep _name1 _name2 _name3 _name4 _histid_`yr' _age
}


************************************* 
************* STEP TWO **************
********** SWAP NAME ORDER **********
*************************************

	// Two-part-name subsets of the unmatched records (no third name part)
	foreach yr in 1900 1910 {
		frame copy f`yr' f`yr'_2, replace
		frame f`yr'_2: keep if _name3=="."
	}

// Reverse the order of the two name parts in the 1900 subset only,
// so that name1 <-> name2 relative to the 1910 subset.
	frame f1900_2 {
		gen _name1_swap = _name2
		replace _name2 = _name1
		drop _name1
		ren _name1_swap _name1
	}

// abematch reads its inputs from disk, so write both subsets out
	frame f1900_2: save "$working/f1900_2.dta", replace
	frame f1910_2: save "$working/f1910_2.dta", replace
	frame change default

// Standard ABE match with swapped two-part names (only _name1 and _name2 are used).
	global match_vars _name1 _name2
	global B "$working/f1910_2.dta"
	global A "$working/f1900_2.dta"
	clear
	abematch $match_vars, file_A($A) file_B($B) timevar(_age) timediff($timediff) ///
		unique_m(2) unique_f(2) keep_A(_histid_1900) keep_B(_histid_1910)
	drop timediff* _age
	ren _*_A _*
	ren _*_B _*
	gen _namefrst="."
	gen _namelast="."
	gen _step = "2.1" // identify this came after swapping 2-part name order. N=2076

// Add the earlier matches to these rows and store the union as 'matches'.
	frameappend matches
	frame copy default matches, replace

// Remove step 2.1 matches from OVERALL working data
// (the frlink variable 'matches' is missing for still-unmatched records)
	foreach yr in 1900 1910 {
		frame change f`yr'
		frlink 1:1 _histid_`yr', frame(matches)
		keep if matches==.
		drop matches
	}

	
 * Three-part-name subsets of the remaining unmatched records
	frame change default
	foreach yr in 1900 1910 {
		frame copy f`yr' f`yr'_3, replace
		frame f`yr'_3: keep if _name3!="." & _name4=="."
	}

// Rotate the name parts in the 1900 subset only:
// name2 -> name1, name3 -> name2, name1 -> name3
// E.g. Deng Xiao Ping -> Xiao Ping Deng
	frame f1900_3 {
		gen first_part = _name1
		replace _name1 = _name2
		replace _name2 = _name3
		replace _name3 = first_part
		drop first_part
	}

// abematch reads its inputs from disk, so write both subsets out
	frame f1900_3: save "$working/f1900_3.dta", replace
	frame f1910_3: save "$working/f1910_3.dta", replace

// Standard ABE match with swapped three-part names. N=11.
	global match_vars _name1 _name2 _name3
	global B "$working/f1910_3.dta"
	global A "$working/f1900_3.dta"

	frame change default
	clear
	abematch $match_vars, file_A($A) file_B($B) timevar(_age) timediff($timediff) ///
		unique_m(2) unique_f(2) keep_A(_histid_1900) keep_B(_histid_1910)

	drop timediff* _age
	ren _*_A _*
	ren _*_B _*
	gen _namefrst="."
	gen _namelast="."
	gen _step = "2.2" // identify this came after 3-part name swaps

// Add the earlier matches to these rows and store the union as 'matches'.
	frameappend matches
	frame copy default matches, replace

// Remove step 2.2 matches from OVERALL working data
	foreach yr in 1900 1910 {
		frame change f`yr'
		frlink 1:1 _histid_`yr', frame(matches)
		keep if matches==.
		drop matches
	}

************************************* 
************ STEP THREE *************
********** STANDARDIZATION **********
*************************************	

// NYC Exclusion Index crosswalk
// For each census year: copy the still-unmatched records, merge the crosswalk
// on the first name part, and overwrite name1 with char1_utf when char1_prop
// is at least 0.7. char1_utf / char1_prop come from the crosswalk file;
// presumably the standardized form and its share -- TODO confirm.
// Merge counts in the original run: 1900 = 41,314 (24,321 replaced);
// 1910 = 27,123 (16,732 replaced).
foreach yr in 1900 1910 {
	frame copy f`yr' f`yr'_std, replace
	frame change f`yr'_std
	ren _name1 name1
	merge m:1 name1 using "$data/crosswalk_final_wide.dta"
	// drop crosswalk rows that matched no census record
	keep if _merge!=2
	drop _merge
	// Stata sorts every missing value above all numbers, so ">=.7" alone is
	// true for missing. missing() also excludes extended missings (.a-.z),
	// which a plain "!=." comparison would let through.
	replace name1=char1_utf if char1_prop>=.7 & !missing(char1_prop)
	save "$working/f`yr'_std.dta", replace
}

// Standard ABE match with standardized names in correct order. N=589.
	global A "$working/f1900_std.dta"
	global B "$working/f1910_std.dta"
	global match_vars name1 _name2 _name3 _name4

	frame change default
	clear
	abematch $match_vars, file_A($A) file_B($B) timevar(_age) timediff($timediff) ///
		unique_m(2) unique_f(2) keep_A(_histid_1900) keep_B(_histid_1910)
	drop timediff* _age
	ren _*_A _*
	ren _*_B _*
	// restore the "_" prefix used by every other variable in 'matches'
	ren name1 _name1
	gen _namefrst="."
	gen _namelast="."
	gen _step = "3.0"

// Add the earlier matches to these rows and store the union as 'matches'.
	frameappend matches
	frame copy default matches, replace
	frame change matches

// Steps 0 and 1 prefixed every variable with "_", later steps did not, so the
// abematch uniqueness flags exist under two names. Fold them into the
// unprefixed versions before dropping the prefixed copies.
	replace unique_file1 = _unique_file1 if unique_file1==.
	replace unique_file2 = _unique_file2 if unique_file2==.
	replace unique_match1 = _unique_match1 if unique_match1==.
	replace unique_match2 = _unique_match2 if unique_match2==.
	drop _unique*
	drop _name4
	// strip the "_" prefix from the remaining variables for the final file
	ren _* *

	save "$data/matches 1900-10.dta", replace
	
	
	
	
